rm(list=ls())
setwd("E:/data/cooperation/poor/gini")
library(foreign)
library(xlsxjars)
library(xlsx)
library(readstata13)
library(stringr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(randomForest)
library(e1071)
library(party)
##data mining----
da1<-read.dta13("data_mining.dta")
name<-colnames(da1)
a<-c(which(name=="year"),which(name=="id"))
da<-da1[,-a]
name<-colnames(da)
n<-dim(da)[2]
for(i in 1:dim(da)[2])
{
  da[,i]<-as.numeric(da[,i])
}
a<-vector(length = n)
for (i in 1:n)
{
  a[i]<-sum(is.na(da[,i]))/dim(da)[1]
}
re<-data.frame(name=name,missing=a)
miss_variable<-re[,2]
da_new<-da[,which(re$missing<0.25)]
##main data----
da_new<-data.frame(pov9=da$pov9,da_new)
name<-colnames(da_new)
##corelation ------
a<-1
for (i in 2:length(name))
{
  a<-cbind(a,cor.test(da_new$pov9,da_new[,i])$estimate)
}
a<-a[-1]
a<-data.frame(cor=a)
a<-abs(a)
##country identi----
da<-data.frame(id=da1$id,year=da1$year,da_new)
names(table(da$id))#202country
da2<-split(da,da$id)  #split by country
country<-labels(da2)#country name
re<-colSums(is.na(da2[[1]]))/dim(da2[[1]])[1]
for (i in 2:length(country))
{
  re1<-colSums(is.na(da2[[i]]))/dim(da2[[i]])[1]
  re<-cbind(re,re1)
}
colnames(re)<-country
write.csv(re,file="balance_country.csv")#the missing status of country 
re_country<-re
ba_country<-colMeans(re_country)
sum<-summary(ba_country)
ba<-data.frame(missing=ba_country)
##fig A1-----
miss_variable<-data.frame(prob=miss_variable,label="Missing data at the variable level") ##missing probability of variables
a<-data.frame(prob=a$cor,label="Correlative coefficient between variables and Gini index")  ##corelation between variables and gini index 
ba<-data.frame(prob=ba$missing,label="Missing data at the national level") ##missing probability of country level
da_A1<-rbind(miss_variable,a,ba)
ggplot(da_A1,aes(x=label))+  ##boxplot
  geom_boxplot(aes(y=prob))+
  geom_jitter(aes(y=prob))+
  theme_bw()+
  labs(y="Probability",size=0.1,x="")+
  theme(axis.text.x = element_text(family="Arial", size=8))+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())
country_name<-names(which(ba_country<=sum[5]))
write.csv(country_name,file="country.csv")
da_1<-da[which(da$id==country_name[1]),]
for (i in 2:length(country_name))
{
  da_1<-rbind(da_1,da[which(da$id==country_name[i]),])
}
da<-da_1  ###this is the main database 
name<-colnames(da)
write.dta(da,file="da_original.dta")
da_rolling<-da
a<-c(which(name=="year"),which(name=="id"))
da_new<-da[,-a]
num_test<-which(is.na(da_new$pov9))
num_train<-(1:3775)[-num_test]
num_test_original<-num_test
num_train_original<-num_train
da_test<-da_new[which(is.na(da_new$pov9)),]
da_train<-da_new[-which(is.na(da_new$pov9)),]

##decision tree----
library(rpart)
result<-rpart(pov9~.,data=da_train)
res<-prune(result,cp=0.02)
library(rpart.plot)
rpart.plot(res,branch=1,type=1, fallen.leaves=T,cex=0.8,box.palette="-Grays")
##predict----
set.seed(100000)
result<-randomForest(pov9~.,data=da_train,importance=TRUE, na.action=na.omit,ntree=1000)
da_1<-predict(result,da_test)
#da_1<-predict(result,da_test)
da$pov9[num_test]<-da_1   #new data after data mining
da$pov9<-as.numeric(da$pov9)
da_base<-da
write.dta(da_base,file="da_base.dta")##simple data mining


##compare the gini index ----
da<-data.frame(da,decision=1)
da1<-da
da1$decision<-"Total data"
da$decision[num_test]<-"Testing data"
da$decision[num_train]<-"Training data"
da_plot<-rbind(da,da1)
ggplot(da_plot,aes(x=decision))+
  geom_boxplot(aes(y=pov9))+
  geom_jitter(aes(y=pov9))+
  theme_bw()+
  labs(y="Gini index",x=NULL,size=0.1)+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())

summary(da_train$pov9)
summary(da_1)
summary(da$pov9)
sd(da_1,na.rm = T)
sd(da$pov9,na.rm = T)

##rolling method----
#da_rolling
name<-names(da_base)
a<-c(which(name=="year"),which(name=="id"))
da_new<-da_rolling[,-a]
num_test<-which(is.na(da_new$pov9))
num_train<-(1:3775)[-num_test]
for (i in 1:26)
{
  n<-sample(num_test,100)
  da_test<-da_new[n,]
  da_train<-da_new[num_train,]
  result<-randomForest(pov9~.,data=da_train,importance=TRUE, na.action=na.omit,ntree=1000)
  da_1<-predict(result,da_test)
  da_new$pov9[n]<-da_1
  da_rolling$pov9[n]<-da_1
  num_test<-num_test[-match(n,num_test)]
  num_train<-c(num_train,n)
}
da_test<-da_new[num_test,]
da_train<-da_new[num_train,]
result<-randomForest(pov9~.,data=da_train,importance=TRUE, na.action=na.omit,ntree=1000)
da_1<-predict(result,da_test)
da_rolling$pov9[num_test]<-da_1   ##da_rolling ,rolling
write.dta(da_rolling,file="da_rolling.dta")

##compare the gini index ----
da_r<-data.frame(da_rolling,decision=1)
da1_r<-da_rolling
da1_r$decision<-"*Total data*"
da_r$decision[num_test_original]<-"*Testing data*"
da_r<-da_r[-num_train_original,]
da_plot_r<-rbind(da_r,da1_r)

summary(da_rolling$pov9[num_train_original])
summary(da_rolling$pov9[num_test_original])
summary(da_rolling$pov9)
sd(da_rolling$pov9[num_test_original],na.rm = T)
sd(da_rolling$pov9,na.rm = T)

##fig A2
da_graph<-rbind(da_plot_r,da_plot)
da_graph<-da_graph[-which(is.na(da_graph$pov9)==1),]
da_graph$decision<-factor(da_graph$decision,levels=c("Training data","Testing data","*Testing data*","Total data","*Total data*"))
ggplot(da_graph,aes(x=decision))+
  geom_boxplot(aes(y=pov9))+
  geom_jitter(aes(y=pov9),width = 0.4,alpha=0.3,stroke=0.1)+
  theme_bw()+
  labs(y="Gini index",x=NULL,size=0.1)+
  theme(title=element_text(size=8),legend.position="none")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5,size=9),axis.line = element_line())

##machine learning----
da<-da_rolling
name<-colnames(da)
a<-c(which(name=="year"),which(name=="id"))
da<-da[,-a]
result<-randomForest(pov9~.,data=da,importance=TRUE, na.action=na.omit,ntree=10)
im<-importance(result)
write.csv(im,file="im.csv")

##fig 2-----
da<-read.dta13("data_mining.dta")
a<- floor((da$year-1993)/13)
a<-as.factor(a)
da<-data.frame(da,year1=a)
ggplot(da,aes(x=vdem_egaldem,y=pov9,colour=factor(year1),fill=factor(year1)))+
  geom_point()+
  geom_smooth(method="lm",formula = y ~log(x),size=1)+
  theme_bw()+
  ylim(30,50)+
  scale_color_manual(name="Year",values=c("gray40", "black"),labels=c('1993-2005','2006-2017'))+
  scale_fill_manual(name="Year",values=c("gray40", "black"),labels=c('1993-2005','2006-2017'))+
  theme(title=element_text(size=10))+
  labs(y="Gini index",x="Level of democracy")+
  theme(panel.grid=element_blank(),panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.line = element_line())

##data processing----
da_1<-read.dta13("da_original.dta")
da<-read.dta13("data_mining.dta")
da_2<-read.dta13("da_rolling.dta")
country<-read.csv("country.csv",header=T)
da_1$id<-factor(da_1$id,levels=country[,2])
names(table(da_1$id))
country<-labels(da2)
da_1$dpi_system[which(da_1$dpi_system==1)]<-0
da_1$dpi_system[which(da_1$dpi_system==2)]<-1
da_1$dpi_system[which(da_1$dpi_system==3)]<-1
da_1$ht_colonial[which(da_1$ht_colonial==1)]<-0
da_1$ht_colonial[which(da_1$ht_colonial>1)]<-1
save.dta13(da_1,file="da_original.dta")
##as for rolling 
country<-read.csv("country.csv",header=T)
da_2$id<-factor(da_2$id,levels=country[,2])
names(table(da_2$id))
country<-labels(da2)
da_2$dpi_system[which(da_2$dpi_system==1)]<-0
da_2$dpi_system[which(da_2$dpi_system==2)]<-1
da_2$dpi_system[which(da_2$dpi_system==3)]<-1
da_2$ht_colonial[which(da_2$ht_colonial==1)]<-0
da_2$ht_colonial[which(da_2$ht_colonial>1)]<-1
save.dta13(da_2,file="da_rolling.dta")


da<-read.dta13("da_rolling.dta")
mean(da$pov9[which(da$vdem_egaldem<0.07)],na.rm = T)
mean(da$pov9[which(da$vdem_egaldem<0.495&da$vdem_egaldem>0.493)],na.rm = T)

mean(da$pov9[which(da$vdem_egaldem<0.8&da$vdem_egaldem>0.7)],na.rm = T)


